Hive复杂数据类型
array
array_contains(column, ‘value’)
hive_array.txt
1 2
| pk beijing,shanghai,tianjin,hangzhou jepson changchu,chengdu,wuhan,beijing
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
| create table hive_array(name string, work_locations array<string>) row format delimited fields terminated by '\t' COLLECTION ITEMS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_array.txt' into table hive_array;
select name,work_locations[0] from hive_array;
select name,size(work_locations) from hive_array;
select * from hive_array where array_contains(work_locations,'tianjin') ;
|
MAP
MAP<primitive_type, data_type> kv
hive_map.txt
1 2 3 4
| 1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28 2,lisi,father:mayun#mother:huangyi#brother:guanyu,22 3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29 4,mayun,father:mayongzhen#mother:angelababy,26
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
| create table hive_map( id int, name string, members map<string,string>, age int ) row format delimited fields terminated by ',' COLLECTION ITEMS TERMINATED BY '#' MAP KEYS TERMINATED BY ':' ;
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_map.txt' into table hive_map;
select id,name,members['father'] as father,members['mother'] as mother from hive_map;
select id,name,map_keys(members) as relations,age from hive_map;
select id,name,map_values(members) as relations,age from hive_map;
select id,name,size(members) from hive_map;
|
struct
structs: STRUCT<col_name : data_type [COMMENT col_comment], …>
内部的数据类型可以不同
192.168.1.1#zhangsan:40
hive_struct.txt
1 2 3 4
| 192.168.1.1#zhangsan:40 192.168.1.2#lisi:50 192.168.1.3#wangwu:60 192.168.1.4#zhaoliu:70
|
1 2 3 4 5 6 7 8 9 10 11 12 13
| create table hive_struct( ip string, info struct<name:string,age:int> ) row format delimited fields terminated by '#' COLLECTION ITEMS TERMINATED BY ':' ;
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_struct.txt' into table hive_struct;
select ip,info.name,info.age from hive_struct;
|
练习
click_log.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
| 11 ad_101 2014-05-01 06:01:12.334+01 22 ad_102 2014-05-01 07:28:12.342+01 33 ad_103 2014-05-01 07:50:12.33+01 11 ad_104 2014-05-01 09:27:12.33+01 22 ad_103 2014-05-01 09:03:12.324+01 33 ad_102 2014-05-02 19:10:12.343+01 11 ad_101 2014-05-02 09:07:12.344+01 35 ad_105 2014-05-03 11:07:12.339+01 22 ad_104 2014-05-03 12:59:12.743+01 77 ad_103 2014-05-03 18:04:12.355+01 99 ad_102 2014-05-04 00:36:39.713+01 33 ad_101 2014-05-04 19:10:12.343+01 11 ad_101 2014-05-05 09:07:12.344+01 35 ad_102 2014-05-05 11:07:12.339+01 22 ad_103 2014-05-05 12:59:12.743+01 77 ad_104 2014-05-05 18:04:12.355+01 99 ad_105 2014-05-05 20:36:39.713+01
|
ad_list.txt
1 2 3 4 5
| ad_101 http://www.google.com catalog8|catalog1 ad_102 http://www.sohu.com catalog6|catalog3 ad_103 http://www.baidu.com catalog7 ad_104 http://www.qq.com catalog5|catalog1|catalog4|catalog9 ad_105 http://sina.com
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
| create table ad_list(ad_id string,url string,catalogs string)row format delimited fields terminated by '\t';
人 访问的所有ad_id 去重 select cookie_id, collect_set(ad_id) from click_log group by cookie_id; 不去重 select cookie_id, collect_set(ad_id) from click_log group by cookie_id;
select click.cookie_id,click.ad_id,click.amount,ad_list.catalogs from (select cookie_id, ad_id, count(1) amount from click_log group by cookie_id, ad_id) click join ad_list on ad_list.ad_id = click.ad_id;
针对ad_list.txt 中的catalogs列转行 ad_101 catalog8|catalog1 ==> ad_101 catalog8 ad_101 catalog1
select ad_id, catalog from ad_list lateral view outer explode(split(catalogs,'\\|')) t as catalog;
注意:\\ 转义字符 lateral view outer explode(split(catalogs,'\\|')) t as catalog
将catalogs字段转换为数组可以对其内部排序 create table ad_list_2(ad_id string,url string,catalogs array<string>)row format delimited fields terminated by '\t' COLLECTION ITEMS TERMINATED BY '|'; select ad_id,sort_array(catalogs) from ad_list_2;
|